Importing Libraries

import pandas as pd
import numpy as np
import seaborn as sns
import scipy.misc
import matplotlib.pyplot as plt
from math import *

%matplotlib inline

K-Means Clustering


def findClosestCentroids(X, centroids):
    c = np.ones((len(X), 1), dtype= np.uint8)
    for i in range(len(X)):
        # Computing distance between training example and centroids
        dist = centroids - X[i]
        dist = dist ** 2
        dist = np.sum(dist, axis=1)
        # Finding closest Centroid
        argmin = np.argmin(dist)
        c[i] = argmin
    # Returning index of the closest centroids
    return c

def computeCentroids(X, idx, K):
    # Creating new centroids matrix
    centroids = np.zeros((K, X.shape[1]))
    for i in range(K):
        # Finding trainig sets with same class
        same_idx = (idx == i)
        # Computing mean of the points 
        centroids[i] = np.mean(X[same_idx], axis=0)
    # Returining new moved centroids
    return centroids

def runKMeans(X, initial_centroids, max_iters, plot_progress = False):
    #Initialize values
    m, n = X.shape
    K = len(initial_centroids)
    centroids = initial_centroids
    previous_centroids = centroids
    idx = np.zeros((m,1), dtype= np.uint8)
    # Run K-Means
    for i in range(max_iters):
        # Finding closest centroids
        idx = findClosestCentroids(X, centroids).T[0]
            plotProgressKMeans(X, centroids, previous_centroids, idx, K, i)
            previous_centroids = centroids
        # Given the memberships, compute new centroids
        centroids = computeCentroids(X, idx, K)
    return ( centroids, idx )

def plotDataPoints(X, idx, K):
    colors = np.zeros(( len(idx) , K))
    colors[ idx == 0 ] = [ 1,0,0 ]
    colors[ idx == 1 ] = [ 0,1,0 ]
    colors[ idx == 2 ] = [ 0,0,1 ]
    plt.scatter(X[:,0], X[:,1], s=60, c= colors)
def plotProgressKMeans(X, centroids, previous_centroids, idx, K, i):
    # Scattering Data Points
    plotDataPoints(X, idx, K)
    # Plotting new Centroids as "X"
    plt.plot(centroids[:,0], centroids[:,1], lw= 0 ,marker='X', c=[0,0,0], ms= 10)
    # Plotting Progress of Algorithm
    for i in range(K):
        # Draw line from point1 to point2
        drawLine(centroids[i, :], previous_centroids[i, :])
def drawLine(p1, p2):
    plt.plot([p1[0], p2[0]], [p1[1], p2[1]], color=[0,0,0], lw=3)
def kMeansInitCentroids(X, K):
    randIdx = np.random.permutation(K)
    return X[randIdx[0:K]]

Load The Data

# Load the Data
mat ="ex7data2.mat")
X = mat["X"]

Test "findClosestCentroids" , "computeMean" Functions

#Select an initial set of centroids
initial_centroids = np.array([[3, 3],
                             [6, 2],
                             [8, 5]])
K = 3

#Find the closest centroids for the examples using the initial_centroids
idx = findClosestCentroids(X,initial_centroids).T[0]
idx[0:3] + 1

array([1, 3, 2], dtype=uint8)

# Centroids computed after initial finding of closest centroids
computeCentroids(X, idx, K)

array([[ 2.42830111,  3.15792418],
       [ 5.81350331,  2.63365645],
       [ 7.11938687,  3.6166844 ]])

Test K-Means Algorithm

#Settings for running K-Means
K = 3
max_iter = 10

array([3, 3])

# Running K-Means function with inital_centroids

(array([[ 1.95399466,  5.02557006],
        [ 3.04367119,  1.01541041],
        [ 6.03366736,  3.00052511]]),
K-Means on Pixels

Load the Image

# Convert Real image to 3d RGB matrix
A = scipy.misc.imread("bird_small.png")

# Reshaping Matrix to 2D Matrix
A_2d = A.reshape( ( A.shape[0] * A.shape[1], A.shape[2] ) )

# Computing memory usage of the matrix
A.nbytes * 8


Running K-Means Algorithm

# Divide by 255 so that all values are in the range 0 - 1
A = A / 255
A_2d = A_2d / 255

# Initial Values of K and max_iters
K = 16; 
max_iters = 100;

# initialize the centroids
init_centroids = kMeansInitCentroids(A_2d, K)

# Running Algorithm
centroids, idx = runKMeans(A_2d, init_centroids ,max_iters)

print("New 16 Different Colors\n")
print("\nIndices of each pixel\n")

New 16 Different Colors

Indices of each pixel

# Recover image with new colors
X_recovered = np.array([centroids[i] for i in idx])
X_recovered = X_recoverd.reshape(A.shape)

fig = plt.figure()

ax1 = fig.add_axes([0,0,1,1])
ax2 = fig.add_axes([1,0,1,1])

ax1.set_title("Before Using K-Means Algorithm")

ax2.set_title("After Using K-Means Algorithm")

<matplotlib.text.Text at 0x7773fa6668>

Principal Component Analysis


def featureNormalize(X):
    X_norm  = X - np.mean(X,axis=0)
    X_norm = X_norm / X.std(axis=0)
    return (X_norm, np.mean(X,axis=0), X.std(axis=0))

def pca(X):
    m, n = X.shape
    sigma = ( 1 / m ) *,X)
    return np.linalg.svd(sigma)

def projectData(X, U, K):
    return, U[:,0:K])

def recoverData(Z, U, K):
    return, U[:,0:K].T)

Load the Data

mat ='ex7data1.mat')
X = mat['X']
m,n = X.shape

Plot the Data

plt.plot(X[:,0],X[:,1],data=X, lw=0, marker="o",ms = 10)

[<matplotlib.lines.Line2D at 0x7773d73f60>]

Use PCA Algorithm

# We should Feature normalize before using PCA Algorithm
X_norm, mu, sigma = featureNormalize(X)

# Finding eigenvectors using PCA algorithm
U, S, V = pca(X_norm)

# Draw the eigenvectors centered at mean of data.
drawLine(mu, mu + 1.5 * S[0] * U[:,0])
drawLine(mu, mu + 1.5 * S[1] * U[:,1])
plt.plot(X[:,0],X[:,1], lw=0, marker="o",ms = 10)

[<matplotlib.lines.Line2D at 0x7773ff65f8>]

# Plot the normalized dataset (returned from pca)
plt.plot(X_norm[:,0],X_norm[:,1], lw=0, marker="o",ms = 10)

[<matplotlib.lines.Line2D at 0x77768d0278>]

# Project data into K dimension
Z = projectData(X_norm, U, 1)

# Printing first 5 projected value

# Recovering Data
X_recovered = recoverData(Z, U, 1)

# Printing first 5 recoverd values

# Draw lines connecting the projected points to the original points

plt.plot(X_recovered[:,0], X_recovered[:,1],
         lw=0, markerfacecolor= "white", markeredgewidth=2 ,markeredgecolor="red" ,marker="o",ms = 10)
plt.plot(X_norm[:,0], X_norm[:,1],
         lw=0,markerfacecolor= "white", markeredgewidth=2 , markeredgecolor="blue", marker="o",ms = 10)

for i in range(m):
    drawLine(X_norm[i,:], X_recovered[i,:])

Face Dataset


def magic_display(matrix = None, cmap= 'gray'):
    if matrix is None:
        # selecting 100 random rows of the X
        rand_indces = np.random.permutation(m)[0:100]
        X_dis = X[rand_indces]
        X_dis = matrix
    if( len(X_dis.shape) > 1 ): 
        m_test,n_test = X_dis.shape
        axis_bound = 1
        m_test = 1
        n_test = X_dis.shape[0]
        axis_bound = 0
    # each number width , height in plot
    example_width = int(round(sqrt(n_test)))
    example_height = int(round( n_test / example_width ))

    # number of numbers to show in plot
    display_rows = floor(sqrt(m_test))
    display_cols = ceil(m_test / display_rows )

    # padding between numbers
    pad = 2

    # intilazation array for holding previos 100 random numbers
    display_array = np.ones((
                             pad + display_rows * ( example_height + pad ),
                             pad + display_cols * ( example_width + pad )
    count = 0;
    for i in range(display_cols):
        for j in range(display_rows):
            if( count >= m_test ):

            # max_val of each row in X_dis
            max_val = np.max( X_dis[count : count+1], axis= axis_bound)

            # Starting x,y point of numbers shape in array 
            ex_x_range = pad + ( j ) * ( example_height + pad )
            ex_y_range = pad + ( i ) * ( example_width + pad )
            if(m_test > 1):
                ex_arr = X_dis[ count : count + 1 , 0:].reshape(example_height , example_width)
                ex_arr = X_dis[1:].reshape(example_height , example_width)
            # Setting values
            display_array[ ex_y_range : ex_y_range + example_width,
                         ex_x_range : ex_x_range + example_height] = np.divide(ex_arr , max_val).T
            count += 1

    # Get rod of grid
    plt.imshow(display_array, cmap= cmap)

Load the Data

mat ="ex7faces.mat")
X = mat['X']

# One example of X

Display Face

In [642]:

PCA Algorithm

X_norm, mu, sigma = featureNormalize(X)

# Finding eigenvectors using PCA algorithm
U, S, V = pca(X_norm)

# Visualize the top 36 eigenvectors found
magic_display(U[:,0:36].T, cmap= 'gist_ncar')

# Dimension Reduction for Faces
K = 150
Z = projectData(X_norm, U, K)

# Visualization of Faces after PCA Dimension Reduction
K = 150
X_rec = recoverData(Z, U, K)

plt.subplot(1,2,1).set_title("Original faces")

plt.subplot(1,2,2).set_title("Recovered faces")

